#coding=utf-8
import optparse
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import jieba
from pyhlseg import *


def showProcess(start_time, done_count, length):
	"""Write a one-line progress report to stdout.

	The line begins with a carriage return and has no trailing newline, so
	successive calls redraw the same terminal line.

	Args:
		start_time: ``time.time()`` timestamp taken when processing began.
		done_count: Number of items finished so far.
		length: Number of characters finished so far.
	"""
	tm_cost = time.time() - start_time
	# Guard the divisions: a call made within one clock tick of start_time
	# (or after the clock stepped backwards) must not raise ZeroDivisionError
	# or report negative rates.
	if tm_cost > 0:
		count_rate = float(done_count) / tm_cost
		length_rate = float(length) / tm_cost / 10000.0
	else:
		count_rate = 0.0
		length_rate = 0.0
	sys.stdout.write(u'\rtime:%d  count:%d  length:%d  speed:%.1f/秒 - %.1f万字/秒'
					 % (tm_cost, done_count, length, count_rate, length_rate))
	# No newline is written, so flush explicitly to make the update visible.
	sys.stdout.flush()

def segment_thread(text, file_result, lock, test_jieba):
	"""Segment one piece of text and record the result.

	Intended to run as a thread-pool task.

	Args:
		text: The text to segment.
		file_result: Writable file object that receives the segmented text,
			or None to discard the output.
		lock: Lock guarding ``file_result`` and the shared progress counters.
		test_jieba: If true, segment with jieba; otherwise with HylandaSegment.

	Side effects:
		Always increments the module-level ``done_count``. Adds ``len(text)``
		to the module-level ``done_length`` only when the text was segmented
		and written successfully.

	Raises:
		Whatever the segmenter or the file write raises; the error is first
		reported on stderr.
	"""
	global done_count
	global done_length
	try:
		# Word segmentation.
		if not test_jieba:
			seg_result = HylandaSegment.segment(text)
			seg_text = str(seg_result.toString())
		else:
			seg_text = ' '.join(jieba.cut(text))
		with lock:
			if file_result is not None:
				file_result.write(seg_text)
			done_length += len(text)
	except Exception as exc:
		# The exception would otherwise sit unnoticed in the task's Future,
		# so make the failure visible before re-raising it.
		sys.stderr.write('\nsegment failed: %r\n' % (exc,))
		raise
	finally:
		# Count failed items too: the submitting loop throttles on
		# read_count - done_count and would stall if failures never counted.
		with lock:
			done_count += 1

def load_hlseg():
	"""Prepare HylandaSegment for use.

	Starts the JVM, loads the dictionary with the built-in user dictionary
	path, and sets the grain size option to ``GrainSize.LARGE``.
	"""
	segmenter = HylandaSegment
	segmenter.start_jvm()
	builtin_user_dict = segmenter.BUILD_IN_USER_DICT
	segmenter.load_dictionary(user_dict_path=builtin_user_dict)
	segmenter.set_option(grain_size=GrainSize.LARGE)

def unload_hlseg():
	"""Shut down the JVM through HylandaSegment (counterpart of load_hlseg)."""
	segmenter = HylandaSegment
	segmenter.shutdown_jvm()

if __name__ == "__main__":
	# Script entry point: segment every line of a corpus file on a thread
	# pool while printing throughput figures.
	#
	# done_count and done_length are module-level counters; the worker
	# function segment_thread updates them through its ``global`` declarations.

	# Command-line parsing.
	usage = "usage: %prog [options] corpus\nexample: %prog /data/test_corpus.txt"
	parser = optparse.OptionParser(usage, version="%prog 0.1.0")
	parser.add_option("-o", "--output", action="store", type="string", dest="output_file", metavar="FILE", help='save result to FILE')
	parser.add_option("-t", "--thread_count", action="store", type="int", dest="thread_count", help='set the thread count', default=1)
	parser.add_option("-j", action="store_true", dest="test_jieba", default=False, help='use jieba for segment')
	(options, args) = parser.parse_args()

	# Validate the arguments; exit when they are wrong.
	if len(args) != 1:
		parser.print_help()
		sys.exit(1)
	if options.thread_count < 1:
		# ThreadPoolExecutor rejects non-positive sizes with a ValueError;
		# report it as a usage error instead of a traceback.
		parser.error("thread count must be at least 1")

	if not options.test_jieba:
		load_hlseg()
	startTime = time.time()
	read_count = 0
	done_count = 0
	done_length = 0
	lock = threading.RLock()
	# Read the corpus line by line and hand each line to the pool.
	with open(args[0], 'r', encoding='utf-8') as corpus:
		file_result = None
		if options.output_file is not None:
			# Match the input encoding instead of relying on the platform default.
			file_result = open(options.output_file, "w", encoding='utf-8')
		try:
			with ThreadPoolExecutor(options.thread_count) as executor:
				for line in corpus:
					read_count += 1
					# Back-pressure: keep at most 10000 lines queued or in flight.
					while read_count - done_count > 10000:
						time.sleep(1)
						showProcess(startTime, done_count, done_length)
					executor.submit(segment_thread, line, file_result, lock, options.test_jieba)
					if read_count % 10000 == 0:
						showProcess(startTime, done_count, done_length)
			# Leaving the executor's ``with`` block waits for all submitted
			# tasks, so the output file is closed only after the last write.
		finally:
			if file_result is not None:
				file_result.close()
	showProcess(startTime, done_count, done_length)
	if not options.test_jieba:
		unload_hlseg()


